In [1]:
import datetime
import numpy as np
import numpy as np
np.random.seed(42)
import tensorflow as tf
tf.set_random_seed(42)
import pandas as pd
import keras.backend as K
from sklearn import preprocessing
from sklearn.metrics import r2_score
from sklearn.feature_selection import VarianceThreshold
from keras.models import Sequential
from keras.layers.core import Dense, Dropout
from keras.layers.recurrent import SimpleRNN, LSTM
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
plt.style.use("ggplot")
%matplotlib inline
In [2]:
# Read both CSV files, using their "ID" column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col="ID")
    for csv_path in ("data/train.csv", "data/test.csv")
)
In [3]:
# Tag every row with its origin so the two frames can be separated again
# after they have been encoded together.
train["data"] = "train"
test["data"] = "test"
combined_data = pd.concat([train, test])

# One-hot encode the listed columns on the combined frame so that both
# origins end up with the same set of dummy columns, then swap the raw
# columns for their encoded counterparts.
categorical_cols = ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"]
encoded = pd.get_dummies(combined_data[categorical_cols])
drop_cat = combined_data.drop(categorical_cols, axis=1)
combined_data_clean = drop_cat.join(encoded)
In [4]:
# Separate the encoded frame by its "data" tag and discard the tag column.
# The test part additionally drops "y".
is_train_row = combined_data_clean["data"] == "train"
is_test_row = combined_data_clean["data"] == "test"
train_data = combined_data_clean[is_train_row].drop("data", axis=1)
test_data = combined_data_clean[is_test_row].drop(["data", "y"], axis=1)
In [5]:
# Display the column labels of the training frame.
train_data.columns
Out[5]:
In [6]:
# Display the column labels of the test frame.
test_data.columns
Out[6]:
In [8]:
# Split the target "y" from the predictors and cast everything to float32.
train_features = train_data.drop("y", axis=1)
y_train, x_train, x_test = (
    frame.astype(np.float32)
    for frame in (train_data["y"], train_features, test_data)
)
In [12]:
# Keep only features whose variance on the training data exceeds
# 0.8 * (1 - 0.8); the selector is fitted on x_train only and the same
# column selection is then applied to x_test.
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
x_train = sel.fit_transform(x_train)
x_test = sel.transform(x_test)
In [13]:
# Display the shape of x_train.
x_train.shape
Out[13]:
In [14]:
# Append a trailing axis of length 1 so each row becomes a column of
# single-value steps: (rows, features) -> (rows, features, 1).
# A single vectorised call replaces the per-row Python loop and, unlike the
# loop, keeps the 3-D shape when x_train has zero rows.
# Assumes x_train is a 2-D array at this point.
train_reshaped = np.expand_dims(x_train, axis=-1).astype(np.float32)
train_reshaped.shape
Out[14]:
In [15]:
def r2_keras(y_true, y_pred):
    """Coefficient of determination (R^2) written with Keras backend ops.

    Computes ``1 - SS_res / (SS_tot + epsilon)`` where ``SS_res`` is the sum
    of squared residuals and ``SS_tot`` is the sum of squared deviations of
    ``y_true`` from its mean.

    Args:
        y_true: Tensor of ground-truth values.
        y_pred: Tensor of predicted values.

    Returns:
        A scalar tensor holding the R^2 value over the values passed in.
    """
    SS_res = K.sum(K.square(y_true - y_pred))
    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
    # K.epsilon() keeps the division finite when y_true is constant.
    return 1 - SS_res / (SS_tot + K.epsilon())
In [16]:
# Optimizer instance used when compiling the models below.
rmsprop = RMSprop(
    lr=0.0001,
    rho=0.9,
    epsilon=1e-08,
    decay=0.0,
)
In [17]:
# Idea: Simple model
# Three stacked bidirectional SimpleRNN layers (128 -> 64 -> 32 units); only
# the last one collapses the sequence, feeding a single linear output unit.
model = Sequential([
    Bidirectional(
        SimpleRNN(128, return_sequences=True, activation="relu"),
        input_shape=(None, 1),
    ),
    Bidirectional(SimpleRNN(64, return_sequences=True, activation="relu")),
    Bidirectional(SimpleRNN(32, return_sequences=False, activation="relu")),
    Dense(1, activation="linear"),
])
model.compile(optimizer=rmsprop, loss="mse", metrics=[r2_keras])
model.summary()
In [29]:
# Idea: Funnel -> reduce information after each layer / deep model
# Pairs of bidirectional SimpleRNN layers are each followed by a per-step
# Dense layer of half the width (128 -> 64 -> 32); the final recurrent layer
# collapses the sequence before dropout and the linear output unit.
# This assignment replaces any `model` defined earlier.
# NOTE(review): Keras Dropout takes the fraction of units to drop, so 0.8
# discards 80% of the activations -- confirm this is intended.
model = Sequential([
    Bidirectional(
        SimpleRNN(128, return_sequences=True, activation="relu"),
        input_shape=(None, 1),
    ),
    Bidirectional(SimpleRNN(128, return_sequences=True, activation="relu")),
    TimeDistributed(Dense(64, activation="relu")),
    Bidirectional(SimpleRNN(64, return_sequences=True, activation="relu")),
    Bidirectional(SimpleRNN(64, return_sequences=True, activation="relu")),
    TimeDistributed(Dense(32, activation="relu")),
    Bidirectional(SimpleRNN(32, return_sequences=False, activation="relu")),
    Dropout(0.8),
    Dense(1, activation="linear"),
])
model.compile(optimizer=rmsprop, loss="mse", metrics=[r2_keras])
model.summary()
In [ ]:
import os

# Checkpoints are written below "test/"; make sure the directory exists so
# saving the first checkpoint does not fail on a fresh checkout.
if not os.path.isdir("test"):
    os.makedirs("test")

# Stop once the training loss has not improved for 10 consecutive epochs.
early_stop = EarlyStopping(monitor="loss", patience=10)

# One checkpoint file per epoch; the name records the epoch number plus the
# validation loss and validation R^2 (both available because
# validation_split is set in fit below).
file_path = "test/weights.{epoch:02d}-{val_loss:.2f}-{val_r2_keras:.2f}.hdf5"
checkpoint = ModelCheckpoint(file_path)

# early_stop was previously created but never handed to fit(), so it had no
# effect; it is now registered alongside the checkpoint callback.
model_run = model.fit(
    train_reshaped,
    y_train,
    epochs=500,
    batch_size=128,
    validation_split=0.02,
    callbacks=[early_stop, checkpoint],
)
In [ ]:
# Predict on the reshaped training inputs with the in-memory model.
y_pred_train = model.predict(train_reshaped)
In [ ]:
# R^2 of the predictions against the training targets.
train_r2 = r2_score(y_train, y_pred_train)
print("the R2 score is : {}".format(train_r2))
In [20]:
# Reload a saved checkpoint. r2_keras is passed through custom_objects so the
# custom metric the model was compiled with can be resolved on load.
# NOTE(review): the file name is hard-coded (epoch 131, val_loss 57.91,
# val_r2 0.59 according to the checkpoint naming pattern); it will only exist
# after a training run that produced exactly this file -- confirm before re-running.
model_loaded = load_model("test/weights.131-57.91-0.59.hdf5", custom_objects={"r2_keras": r2_keras})
In [21]:
# Predict on the reshaped training inputs with the reloaded checkpoint
# (overwrites any earlier y_pred_train).
y_pred_train = model_loaded.predict(train_reshaped)
In [22]:
# R^2 of the current y_pred_train against the training targets.
train_r2 = r2_score(y_train, y_pred_train)
print("the R2 score is : {}".format(train_r2))
In [ ]:
In [23]:
# Append a trailing axis of length 1 to the test inputs:
# (rows, features) -> (rows, features, 1).
# A single vectorised call replaces the per-row Python loop and, unlike the
# loop, keeps the 3-D shape when x_test has zero rows.
# Assumes x_test is a 2-D array at this point.
test_reshaped = np.expand_dims(x_test, axis=-1).astype(np.float32)
test_reshaped.shape
Out[23]:
In [24]:
# Predict on the reshaped test inputs with the reloaded checkpoint.
y_pred_test = model_loaded.predict(test_reshaped)
In [25]:
# Pair each test ID with its prediction, flattened to one dimension.
flat_predictions = y_pred_test.reshape(-1)
output = pd.DataFrame({"ID": test.index, "y": flat_predictions})
In [26]:
# Preview the first rows of the submission frame.
output.head()
Out[26]:
In [27]:
# Build a filesystem-safe timestamp: the default string form of a datetime
# contains a space and colons, which are not valid in file names on some
# platforms. Microseconds are kept so successive runs get distinct names.
timestamp = datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S-%f")
output.to_csv("submissions_{}.csv".format(timestamp), index=False)
In [ ]:
In [28]:
# Histogram of the predicted "y" values, using 100 bins.
output["y"].hist(bins=100)
Out[28]:
In [ ]:
In [ ]:
# First ensemble member, read from a previously saved submission file.
sub_1 = pd.read_csv("submission_baseLine.csv")
In [ ]:
# Second ensemble member, read from a timestamped submission file.
# NOTE(review): the file name is hard-coded and must already exist on disk.
sub_2 = pd.read_csv("submissions_2017-05-31 15:48:40.546392.csv")
In [ ]:
# Third ensemble member: a copy of the in-memory `output` frame.
sub_3 = output.copy()
In [ ]:
# Element-wise average of the three "y" columns. Rows are combined purely by
# position; IDs are not compared.
running_total = sub_1["y"].values + sub_2["y"].values
running_total = running_total + sub_3["y"].values
mean_pred = running_total / 3
In [ ]:
# Pair each test ID with its averaged prediction.
mean_columns = {"ID": test.index, "y": mean_pred}
output_mean = pd.DataFrame(mean_columns)
In [ ]:
# Build a filesystem-safe timestamp: the default string form of a datetime
# contains a space and colons, which are not valid in file names on some
# platforms. Microseconds are kept so successive runs get distinct names.
timestamp = datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S-%f")
output_mean.to_csv("submissions_mean_{}.csv".format(timestamp), index=False)
In [ ]:
In [ ]:
# First member of the two-way average, read from a previously saved
# submission file.
sub_1 = pd.read_csv("submission_baseLine.csv")
In [ ]:
# Second member of the two-way average, read from a timestamped submission file.
# NOTE(review): the file name is hard-coded and must already exist on disk.
sub_2 = pd.read_csv("submissions_2017-05-31 15:48:40.546392.csv")
In [ ]:
# Element-wise average of the two "y" columns. Rows are combined purely by
# position; IDs are not compared.
pair_total = sub_1["y"].values + sub_2["y"].values
mean_pred = pair_total / 2
In [ ]:
# Pair each test ID with its averaged prediction.
mean_columns = {"ID": test.index, "y": mean_pred}
output_mean = pd.DataFrame(mean_columns)
In [ ]:
# Build a filesystem-safe timestamp: the default string form of a datetime
# contains a space and colons, which are not valid in file names on some
# platforms. Microseconds are kept so successive runs get distinct names.
timestamp = datetime.datetime.today().strftime("%Y-%m-%d_%H-%M-%S-%f")
output_mean.to_csv("submissions_mean_2_{}.csv".format(timestamp), index=False)
In [ ]:
In [ ]: